{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "from getpass import getpass\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from lxml.html import fromstring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "from selenium.webdriver.support.wait import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from selenium.webdriver.common.by import By "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on package selenium:\n",
      "\n",
      "NAME\n",
      "    selenium\n",
      "\n",
      "DESCRIPTION\n",
      "    # Licensed to the Software Freedom Conservancy (SFC) under one\n",
      "    # or more contributor license agreements.  See the NOTICE file\n",
      "    # distributed with this work for additional information\n",
      "    # regarding copyright ownership.  The SFC licenses this file\n",
      "    # to you under the Apache License, Version 2.0 (the\n",
      "    # \"License\"); you may not use this file except in compliance\n",
      "    # with the License.  You may obtain a copy of the License at\n",
      "    #\n",
      "    #   http://www.apache.org/licenses/LICENSE-2.0\n",
      "    #\n",
      "    # Unless required by applicable law or agreed to in writing,\n",
      "    # software distributed under the License is distributed on an\n",
      "    # \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n",
      "    # KIND, either express or implied.  See the License for the\n",
      "    # specific language governing permissions and limitations\n",
      "    # under the License.\n",
      "\n",
      "PACKAGE CONTENTS\n",
      "    common (package)\n",
      "    webdriver (package)\n",
      "\n",
      "VERSION\n",
      "    3.141.0\n",
      "\n",
      "FILE\n",
      "    c:\\programdata\\anaconda3\\lib\\site-packages\\selenium\\__init__.py\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import selenium\n",
    "# Print the package help text (it includes the installed version and file location).\n",
    "help(selenium)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: DeprecationWarning: use options instead of chrome_options\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# Configure and start the Chrome session used by every cell below.\n",
    "chrome_options = webdriver.ChromeOptions()\n",
    "\n",
    "chrome_options.add_argument('--no-sandbox')  # works around the DevToolsActivePort-file-not-found error\n",
    "# NOTE(review): Chrome documents this switch as --window-size=W,H; confirm the 'x' form is honoured.\n",
    "chrome_options.add_argument('window-size=1920x3000')  # requested browser resolution\n",
    "chrome_options.add_argument('--disable-gpu')  # flag the Chrome docs suggest to avoid a bug\n",
    "chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "\n",
    "# `options=` is the supported keyword; `chrome_options=` raises a DeprecationWarning.\n",
    "driver = webdriver.Chrome(options=chrome_options)\n",
    "driver.get('https://mp.weixin.qq.com')  # open the WeChat Official Accounts Platform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill in the login form and log in"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Login credentials: taken from environment variables, otherwise prompted for,\n",
    "# so that no secret is stored in the notebook itself.\n",
    "# NOTE(review): a password used to be hard-coded in this cell; treat it as compromised and rotate it.\n",
    "payload = {\n",
    "    \"account\": os.environ.get(\"WECHAT_MP_ACCOUNT\") or input(\"WeChat MP account: \"),\n",
    "    \"password\": os.environ.get(\"WECHAT_MP_PASSWORD\") or getpass(\"WeChat MP password: \"),\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch the login panel to account/password login.\n",
    "login_type_xpath = '//div[@class=\"login__type__container login__type__container__scan\"]/a'\n",
    "driver.find_element(By.XPATH, login_type_xpath).click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Clear the account input.\n",
    "account_input_xpath = '//form[@class=\"login_form\"]//input[@name=\"account\"]'\n",
    "driver.find_element(By.XPATH, account_input_xpath).clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Type the account name into the account input.\n",
    "account_input = driver.find_element(By.XPATH, '//form[@class=\"login_form\"]//input[@name=\"account\"]')\n",
    "account_input.send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Clear the password input.\n",
    "password_input_xpath = '//form[@class=\"login_form\"]//input[@name=\"password\"]'\n",
    "driver.find_element(By.XPATH, password_input_xpath).clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Type the password into the password input.\n",
    "password_input = driver.find_element(By.XPATH, '//form[@class=\"login_form\"]//input[@name=\"password\"]')\n",
    "password_input.send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Click the link inside the login button panel.\n",
    "login_button = driver.find_element(By.XPATH, '//div[@class=\"login_btn_panel\"]/a')\n",
    "login_button.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the current page.\n",
    "driver.refresh()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate the menu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the link in the page header (#mp_header).\n",
    "header_link_xpath = '//*[@id=\"mp_header\"]/div/div/a'\n",
    "element = driver.find_element(By.XPATH, header_link_xpath)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Click the image-and-text material (图文素材) entry in the side menu.\n",
    "material_entry_xpath = '//*[@id=\"js_mp_sidemenu\"]/div/div/ul/li[2]/ul/li[1]/ul/li[1]/a/span'\n",
    "element = driver.find_element(By.XPATH, material_entry_xpath)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Click the + icon.\n",
    "plus_icon_xpath = '//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div[1]/i'\n",
    "element = driver.find_element(By.XPATH, plus_icon_xpath)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Click the new image-and-text message (新建图文消息) entry.\n",
    "new_article_xpath = '//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div[2]/ul/li[1]/a'\n",
    "element = driver.find_element(By.XPATH, new_article_xpath)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-ADA9AA5778E640E9DE2E80671C0A8C45', 'CDwindow-E586186FF4804916C400D5C0D11382D8']\n"
     ]
    }
   ],
   "source": [
    "# Inspect the handles of the currently open browser windows.\n",
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# Switch to the second window handle (presumably the editor opened by the previous click).\n",
    "# `switch_to.window` is the supported API; `switch_to_window` raises a DeprecationWarning.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Click the hyperlink button of the editor (#js_editor_insertlink).\n",
    "insert_link_xpath = '//*[@id=\"js_editor_insertlink\"]'\n",
    "element = driver.find_element(By.XPATH, insert_link_xpath)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Click the option for choosing another official account (选择其他公众号).\n",
    "other_account_xpath = '//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/p/div'\n",
    "element = driver.find_element(By.XPATH, other_account_xpath)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Type the official-account name into the search input.\n",
    "account_search_xpath = '//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/input'\n",
    "driver.find_element(By.XPATH, account_search_xpath).clear()\n",
    "# The input is looked up a second time rather than reusing the first handle.\n",
    "driver.find_element(By.XPATH, account_search_xpath).send_keys('黑星俱樂部')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# Click the magnifier icon to run the search; its innerHTML is printed first for inspection.\n",
    "search_button_xpath = '/html/body/div[2]/div/div/div/div/div[6]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/span/button[2]'\n",
    "element = driver.find_element(By.XPATH, search_button_xpath)\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/ULYj6QDI436mWhatIL7THNaMv60QdmVamV1MIkQmk2E7GOeRaJjpa78VG48HarDet7YsuUaWINQuWVbtMEwk4w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">黑星俱樂部</strong> <i class=\"inner_link_account_wechat\">微信号：blackstar2014</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/LrsbrqFDhIbnkcNOicUxBqEGoibm2MDYMGvtibTKcOsX0ibB5QzFoWJicWeehXeeJqU0FpfqlHyI4ictZjHlPzKSZ56g/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">富翁总裁俱乐部</strong> <i class=\"inner_link_account_wechat\">微信号：FWZC6688</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/iaIQI3EXF6xRTD5FtTFxSqn2LwKB2EsJUgK46OVmhc7KYkuK3Plpy7mRw6hngUibat9FtLU5GfVZox4L6SicVbBew/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">FLab健身俱乐部</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/iaC18vMVA8yNmPf5sESn5LKicss4JPUUK5pzibGUEnomoO1OU269xFFuvo7rdynYV2H5fCHic6hSsU5d3mOfGxKJFw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">星悦荟时尚俱乐部</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/YqPLHASicpEfBVKqwoKHUNLotia990UnNdCvNQcYjQN3JqStdJmtKib2gicZ5AzMkK9EnqiaO0CzszErzibHaJoCcpEQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">团一波手游俱乐部</strong> <i class=\"inner_link_account_wechat\">微信号：ECC-sevenwin</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "# Capture (and print) the innerHTML of the account search-results list.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"inner_link_account_list\"]')\n",
    "公众号SERP = main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the captured results HTML with lxml.\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every li element with class inner_link_account_item is one account in the results.\n",
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "# Collect nickname text, WeChat-id text and avatar URL for each result item.\n",
    "account_list = [\n",
    "    {\n",
    "        \"nickname\": e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text,\n",
    "        \"wechat\": e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text,\n",
    "        \"img\": e.xpath('./div/img/@src')[0],\n",
    "    }\n",
    "    for e in 主\n",
    "]\n",
    "\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>黑星俱樂部</td>\n",
       "      <td>微信号：blackstar2014</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/ULYj6QDI436mWha...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>富翁总裁俱乐部</td>\n",
       "      <td>微信号：FWZC6688</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/LrsbrqFDhIbnkcN...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>FLab健身俱乐部</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/iaIQI3EXF6xRTD5...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>星悦荟时尚俱乐部</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/iaC18vMVA8yNmPf...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>团一波手游俱乐部</td>\n",
       "      <td>微信号：ECC-sevenwin</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/YqPLHASicpEfBVK...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    nickname             wechat  \\\n",
       "0      黑星俱樂部  微信号：blackstar2014   \n",
       "1    富翁总裁俱乐部       微信号：FWZC6688   \n",
       "2  FLab健身俱乐部            微信号：未设置   \n",
       "3   星悦荟时尚俱乐部            微信号：未设置   \n",
       "4   团一波手游俱乐部   微信号：ECC-sevenwin   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/ULYj6QDI436mWha...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/LrsbrqFDhIbnkcN...  \n",
       "2  http://mmbiz.qpic.cn/mmbiz_png/iaIQI3EXF6xRTD5...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/iaC18vMVA8yNmPf...  \n",
       "4  http://mmbiz.qpic.cn/mmbiz_png/YqPLHASicpEfBVK...  "
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the parsed accounts table.\n",
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/ULYj6QDI436mWhatIL7THNaMv60QdmVamV1MIkQmk2E7GOeRaJjpa78VG48HarDet7YsuUaWINQuWVbtMEwk4w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">黑星俱樂部</strong> <i class=\"inner_link_account_wechat\">微信号：blackstar2014</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "# Select 黑星俱樂部: click the first item of the results list after printing its innerHTML.\n",
    "first_result_xpath = '//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div[2]/ul/li[1]'\n",
    "element = driver.find_element(By.XPATH, first_result_xpath)\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 124]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# Page-jump upper bound: read the numbers shown in the pagination labels.\n",
    "l_e = driver.find_elements(By.XPATH, '//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int = list(map(lambda label: int(label.text), l_e))\n",
    "# Print the numbers, then whether the first and last label are equal.\n",
    "print(l_e_int, l_e_int[0] == l_e_int[-1], sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124]\n"
     ]
    }
   ],
   "source": [
    "# All page numbers from 1 through the last pagination label value.\n",
    "last_page = l_e_int[-1]\n",
    "pages = [*range(1, last_page + 1)]\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]\n"
     ]
    }
   ],
   "source": [
    "# Limit this run to the first MAX_PAGES pages, but never go past the last page\n",
    "# reported by the pagination labels (the old hard-coded 1..50 ignored that bound).\n",
    "MAX_PAGES = 50\n",
    "pages = list(range(1, min(MAX_PAGES, l_e_int[-1]) + 1))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Global variables\n",
    "html_raw = dict()  # page number -> innerHTML of that page's article list (written by process_pages)\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages(pages, min_delay=15, max_jitter=60):\n",
    "    \"\"\"Visit each page of the article list and store its HTML in ``html_raw``.\n",
    "\n",
    "    For every page number this types the number into the pagination jump box,\n",
    "    clicks the jump link, sleeps for a randomised delay, and then saves the\n",
    "    innerHTML of the ``inner_link_article_list`` container as ``html_raw[page]``.\n",
    "\n",
    "    Args:\n",
    "        pages: iterable of page numbers to visit.\n",
    "        min_delay: minimum number of seconds to sleep after each jump.\n",
    "        max_jitter: upper bound, in seconds, of the random extra sleep added\n",
    "            on top of ``min_delay``.\n",
    "\n",
    "    Returns:\n",
    "        None. Results are written into the module-level ``html_raw`` dict, so\n",
    "        pages stored before an exception remain available.\n",
    "    \"\"\"\n",
    "    jump_form_xpath = '//span[@class=\"weui-desktop-pagination__form\"]'\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        # The jump controls are looked up again on every iteration.\n",
    "        jump_input = driver.find_element(By.XPATH, jump_form_xpath + '/input')\n",
    "        jump_link = driver.find_element(By.XPATH, jump_form_xpath + '/a')\n",
    "        jump_input.clear()\n",
    "        jump_input.send_keys(str(p))  # send_keys is given text, not a bare int\n",
    "        jump_link.click()\n",
    "\n",
    "        # Randomised pause between page jumps before reading the list.\n",
    "        time.sleep(min_delay + max_jitter * random())\n",
    "\n",
    "        article_list = driver.find_element(By.XPATH, '//div[@class=\"inner_link_article_list\"]')\n",
    "        html_raw[p] = article_list.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t"
     ]
    }
   ],
   "source": [
    "# Visit every page in `pages`; the HTML of each page is stored in `html_raw`.\n",
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "6   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "7   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "8   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "9   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "10  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "11  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "13  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "14  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "15  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "16  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "17  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "18  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "19  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "20  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "21  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "22  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "23  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "24  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "25  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "26  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "27  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "28  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "29  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "30  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "31  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "32  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "33  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "34  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "35  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "36  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "37  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "38  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "39  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "40  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "41  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "42  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "43  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "44  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "45  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "46  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "47  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "48  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "49  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "50  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build a one-column frame from the html_raw dict: one row per dict key.\n",
    "# The single-row frame is transposed, so its lone column is labelled 0 before the rename.\n",
    "df = pd.DataFrame([html_raw]).T.rename(columns={0: \"html_snippets\"})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle\n",
    "\n",
    "# Write the raw snippets to disk as a pickle. The context manager guarantees the\n",
    "# file is flushed and closed as soon as the dump finishes (or fails).\n",
    "with open(\"html_raw\", 'wb') as filehandler:\n",
    "    pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "49\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Flag every row that repeats an earlier row, then split the frame on that flag.\n",
    "is_repeat = df.duplicated()\n",
    "df_out = df.loc[~is_repeat]\n",
    "print (len(df_out))\n",
    "# Show the rows that were dropped as duplicates.\n",
    "df.loc[is_repeat]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[12]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[12]"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index labels of the duplicated rows come first ...\n",
    "try_again = list(df.loc[df.duplicated()].index)\n",
    "print(try_again)\n",
    "# ... followed by every entry of pages that is absent from the index of df.\n",
    "missing_pages = set(pages).difference(set(df.index.values))\n",
    "try_again = try_again + list(missing_pages)\n",
    "try_again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output filename templates. Each value carries a {公众号} placeholder intended for str.format.\n",
    "fn = {\n",
    "    \"output\": {\n",
    "        \"公众号_htm_snippets\": \"公众号_htm_snippets_{公众号}.tsv\",\n",
    "        \"公众号_df\": \"公众号_df_{公众号}.tsv\",\n",
    "        \"公众号_xlsx\": \"公众号_url_{公众号}.xlsx\",\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "公众号 = \"黑星俱樂部\"\n",
    "filename = fn[\"output\"][\"公众号_htm_snippets\"]\n",
    "# Fill the account name into the template, then save the de-duplicated snippets\n",
    "# as a tab-separated file.\n",
    "snippets_path = filename.format(公众号=公众号)\n",
    "df_out.to_csv(snippets_path, sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6,5,5,5,5,5,5,5,5,5,5,5,5,6,5,5,6,5,5,5,5,5,5,5,5,5,5,5,5,6,5,6,6,6,6,5,5,5,6,6,6,5,5,5,6,7,6,5,6,5,"
     ]
    }
   ],
   "source": [
    "def parse_html_snippets(_snippet_):\n",
    "    \"\"\"Turn one HTML snippet into a DataFrame with one row per article.\n",
    "\n",
    "    From the parsed snippet this collects the text of the second span under each\n",
    "    inner_link_article_title div, the text of each inner_link_article_date div and\n",
    "    the href of every anchor element, then fetches the text behind each href with\n",
    "    get_text.\n",
    "\n",
    "    Returns a DataFrame with the columns title, create_time, link and text.\n",
    "    The four collected lists must have the same length, otherwise the DataFrame\n",
    "    constructor raises.\n",
    "    \"\"\"\n",
    "    tree = fromstring(_snippet_)\n",
    "    titles = [node.text for node in tree.xpath('//div[@class=\"inner_link_article_title\"]/span[2]')]\n",
    "    dates = [node.text for node in tree.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    hrefs = list(tree.xpath('//a/@href'))\n",
    "    # One HTTP fetch per link, in document order.\n",
    "    bodies = [get_text(href) for href in hrefs]\n",
    "    return pd.DataFrame({\"title\": titles, \"create_time\": dates, \"link\": hrefs, \"text\": bodies})\n",
    "\n",
    "def get_text(link):\n",
    "    \"\"\"Download a page and return the text found inside its js_content element.\n",
    "\n",
    "    Args:\n",
    "        link: URL of the page to fetch.\n",
    "\n",
    "    Returns:\n",
    "        A string: the text nodes of all span elements below the element with\n",
    "        id js_content, joined together, followed by the text nodes of all p\n",
    "        elements below it, joined together.\n",
    "    \"\"\"\n",
    "    text_xpath_1 = '//*[@id=\"js_content\"]//span/text()'\n",
    "    text_xpath_2 = '//*[@id=\"js_content\"]//p/text()'\n",
    "    # This function is called once per article link, so close the session on exit\n",
    "    # instead of leaving one open session (and its connections) behind per call.\n",
    "    with HTMLSession() as session:\n",
    "        r = session.get(url=link)\n",
    "        text_1 = ''.join(r.html.xpath(text_xpath_1))\n",
    "        text_2 = ''.join(r.html.xpath(text_xpath_2))\n",
    "    return text_1 + text_2\n",
    "\n",
    "# Parse the snippet of every page and keep the per-page frames in a list.\n",
    "l_df = []\n",
    "for p in pages:\n",
    "    snippet = df.loc[p, \"html_snippets\"]\n",
    "    _df_ = parse_html_snippets(snippet)\n",
    "    l_df.append(_df_)\n",
    "    # Progress trace: number of rows parsed from this page.\n",
    "    print (len(_df_), end=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>稻田里拉小提琴的袁隆平是我见过最浪漫的男人</td>\n",
       "      <td>2021-05-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>blackstarLoveHateDreamLifeWorkPlayEmotionSexar...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>我和NBA球星C.J. McCollum聊了聊（附视频采访）</td>\n",
       "      <td>2021-05-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>blackstarLoveHateDreamLifeWorkPlayEmotionSexar...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>这场潮牌大秀的设计师团队竟然来自YEEZY和UNDERCOVER！</td>\n",
       "      <td>2021-04-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>blackstarLoveHateDreamLifeWorkPlayEmotionSexar...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>玩土酷的巴黎世家可能最懂中国</td>\n",
       "      <td>2020-08-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>blackstarLoveHateDreamLifeWorkPlayEmotionSexar...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>比罗德曼骚的人还没生出来</td>\n",
       "      <td>2020-08-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>blackstarLoveHateDreamLifeWorkPlayEmotionSexar...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>被社会毒打之前，尽情支持Yamy吧！</td>\n",
       "      <td>2020-07-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>blackstarLoveHateDreamLifeWorkPlayEmotionSexar...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>今天有多少女生为PUA做了免费广告？</td>\n",
       "      <td>2019-12-13</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>在最好的年华说再见 R.I.P Juice WRLD</td>\n",
       "      <td>2019-12-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>就算你是一个杀手，一样会有小学同学</td>\n",
       "      <td>2019-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>谷 歌 地 球 の 浪 漫</td>\n",
       "      <td>2019-12-02</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>FKA twigs，不会跳舞的歌手不是好神婆</td>\n",
       "      <td>2019-11-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                title create_time  \\\n",
       "0               稻田里拉小提琴的袁隆平是我见过最浪漫的男人  2021-05-23   \n",
       "1      我和NBA球星C.J. McCollum聊了聊（附视频采访）  2021-05-23   \n",
       "2   这场潮牌大秀的设计师团队竟然来自YEEZY和UNDERCOVER！  2021-04-18   \n",
       "3                      玩土酷的巴黎世家可能最懂中国  2020-08-11   \n",
       "4                        比罗德曼骚的人还没生出来  2020-08-04   \n",
       "5                  被社会毒打之前，尽情支持Yamy吧！  2020-07-22   \n",
       "6                  今天有多少女生为PUA做了免费广告？  2019-12-13   \n",
       "7          在最好的年华说再见 R.I.P Juice WRLD  2019-12-09   \n",
       "8                   就算你是一个杀手，一样会有小学同学  2019-12-07   \n",
       "9                       谷 歌 地 球 の 浪 漫  2019-12-02   \n",
       "10             FKA twigs，不会跳舞的歌手不是好神婆  2019-11-25   \n",
       "\n",
       "                                                 link  \\\n",
       "0   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "1   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "2   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "3   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "4   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "5   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "6   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "7   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "8   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "9   http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "10  http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "\n",
       "                                                 text  \n",
       "0   blackstarLoveHateDreamLifeWorkPlayEmotionSexar...  \n",
       "1   blackstarLoveHateDreamLifeWorkPlayEmotionSexar...  \n",
       "2   blackstarLoveHateDreamLifeWorkPlayEmotionSexar...  \n",
       "3   blackstarLoveHateDreamLifeWorkPlayEmotionSexar...  \n",
       "4   blackstarLoveHateDreamLifeWorkPlayEmotionSexar...  \n",
       "5   blackstarLoveHateDreamLifeWorkPlayEmotionSexar...  \n",
       "6   LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  \n",
       "7   LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  \n",
       "8   LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  \n",
       "9   LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  \n",
       "10  LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  "
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the per-page frames into one table with a fresh 0..n-1 index.\n",
    "df_url_out = pd.concat(l_df, ignore_index=True)\n",
    "# Preview the first eleven rows (labels 0 through 10).\n",
    "df_url_out.head(11)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the combined article table to a single worksheet of a new workbook.\n",
    "excel_path = '黑星俱樂部數據.xlsx'\n",
    "with pd.ExcelWriter(excel_path, mode='w', engine=\"openpyxl\") as writer:\n",
    "    df_url_out.to_excel(writer, sheet_name='1-50页')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>261</th>\n",
       "      <td>说FILA过时的人才是真过时了</td>\n",
       "      <td>2017-04-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>262</th>\n",
       "      <td>你应该学会欣赏女孩子</td>\n",
       "      <td>2017-04-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>263</th>\n",
       "      <td>奶奶的THUG LIFE得等你老了才会懂</td>\n",
       "      <td>2017-04-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>264</th>\n",
       "      <td>我们找到了破坏共享单车的幕后真凶</td>\n",
       "      <td>2017-04-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>265</th>\n",
       "      <td>在现实生活中买不起房的你在游戏里就可以？</td>\n",
       "      <td>2017-04-15</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...</td>\n",
       "      <td>LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    title create_time  \\\n",
       "261       说FILA过时的人才是真过时了  2017-04-21   \n",
       "262            你应该学会欣赏女孩子  2017-04-19   \n",
       "263  奶奶的THUG LIFE得等你老了才会懂  2017-04-17   \n",
       "264      我们找到了破坏共享单车的幕后真凶  2017-04-16   \n",
       "265  在现实生活中买不起房的你在游戏里就可以？  2017-04-15   \n",
       "\n",
       "                                                  link  \\\n",
       "261  http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "262  http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "263  http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "264  http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "265  http://mp.weixin.qq.com/s?__biz=MzA3NTM1NTUzMg...   \n",
       "\n",
       "                                                  text  \n",
       "261  LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  \n",
       "262  LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  \n",
       "263  LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  \n",
       "264  LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  \n",
       "265  LoveHateDreamLifeWorkPlayEmotionSexarts／fashio...  "
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the last five rows of the combined table.\n",
    "df_url_out.iloc[-5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
